#  
#  WordCompounder --- Executable file name: GoMusubi
#  Copyright(C) 2021 Kaoru Sagara and Syugo Nakamura 
#  This software is released under any of the GPL (see the file GPL), the LGPL(see the file LGPL), or the BSD License (see the file BSD).

from abc import ABCMeta, abstractmethod
import copy
import re
from unicodedata import normalize
from components.exceptions import SaveFailureException


class Compounder:
  """
  Base class for compound-word generators that sit on top of a MeCab parser.

  The class itself only stores the compound list (initially empty) and knows
  how to save results; presumably subclasses populate ``_compound_list`` from
  their ``process`` implementation and provide the parser through
  ``get_mecab_parser``.
  """
  # NOTE(review): ``__metaclass__`` is the Python 2 spelling and is ignored by
  # Python 3, so the ``@abstractmethod`` markers below are not enforced when
  # the class is instantiated. It is left untouched because switching to
  # ``metaclass=ABCMeta`` would make incomplete subclasses fail to instantiate.
  __metaclass__ = ABCMeta

  def __init__(self):
    self._compound_list = []

  @abstractmethod
  def process(self):
    """
    Run the compound-word generation.
    """
    pass

  def save_result(self, filepath, encoding='utf-8', keep_compound=False):
    """
    Save the morphological analysis result, with compound words applied, as text.

    Args:
        filepath (str): Destination file path.
        encoding (str, optional): Output encoding. Defaults to 'utf-8'.
        keep_compound (bool, optional): True to give priority to compound-word
            units (the input text is split around every compound occurrence and
            only the remaining text is re-parsed); False to give priority to
            the original token boundaries (consecutive tokens are merged only
            when their concatenation is exactly a compound word).
            Defaults to False.

    Raises:
        SaveFailureException: If the file cannot be written because of a
            permission error.
    """
    source_parser = self.get_mecab_parser()
    input_text = source_parser.input_text
    # Work on a shallow copy; presumably this keeps the re-parsing done in the
    # keep_compound branch from replacing state on the caller's parser.
    mecab_parser = copy.copy(source_parser)

    # Features attached to every token that is emitted as a compound word.
    compound_features = ['名詞', '一般', '合成語']

    if keep_compound:
      all_result_list = self._tokenize_around_compounds(
        mecab_parser, input_text, compound_features)
    else:
      all_result_list = self._merge_tokens_into_compounds(
        mecab_parser.result_list, compound_features)

    output_text = mecab_parser.make_output(all_result_list, fmt='default')
    self._write_text(filepath, output_text, encoding)

  def save_compound_word(self, filepath, encoding='utf-8'):
    """
    Save the generated compound words as text, one word per line.

    Args:
        filepath (str): Destination file path.
        encoding (str, optional): Output encoding. Defaults to 'utf-8'.

    Raises:
        SaveFailureException: If the file cannot be written because of a
            permission error.
    """
    self._write_text(filepath, "\n".join(self.compound_list), encoding)

  @property
  def compound_list(self):
    """
    Return the list of compound words.

    Returns:
        list: The internal compound-word list itself (not a copy).
    """
    return self._compound_list

  @abstractmethod
  def get_mecab_parser(self):
    """
    Return the MeCab parser.
    """
    raise NotImplementedError

  @property
  def mecab_parser(self):
    """Property shortcut for ``get_mecab_parser()``."""
    return self.get_mecab_parser()

  def _find_compound_spans(self, input_text):
    """Return non-overlapping ``(start, length, compound)`` spans found in the text."""
    spans = []
    # dict.fromkeys drops duplicate compounds while keeping a stable order.
    for compound in dict.fromkeys(self.compound_list):
      if not compound:
        # An empty pattern would match at every position of the text.
        continue
      spans.extend(
        (found.start(), len(compound), compound)
        for found in re.finditer(re.escape(compound), input_text))

    # Leftmost span first; at the same position the longest compound wins.
    spans.sort(key=lambda span: (span[0], -span[1]))

    selected = []
    last_end_pos = 0
    for span in spans:
      start_pos, length, _ = span
      if start_pos >= last_end_pos:
        selected.append(span)
        last_end_pos = start_pos + length
    return selected

  def _tokenize_around_compounds(self, mecab_parser, input_text, compound_features):
    """Emit compound occurrences as single tokens and re-parse the text between them."""
    tokens = []
    cursor = 0
    for start_pos, length, compound in self._find_compound_spans(input_text):
      gap_text = input_text[cursor:start_pos]
      cursor = start_pos + length
      if gap_text:
        tokens.extend(mecab_parser.parse(gap_text, normalize=False))
      tokens.append([compound, compound_features])

    tail_text = input_text[cursor:]
    if tail_text:
      tokens.extend(mecab_parser.parse(tail_text, normalize=False))
    return tokens

  def _merge_tokens_into_compounds(self, result_list, compound_features):
    """Merge runs of consecutive tokens whose joined surfaces equal a compound word."""
    compounds = {compound for compound in self.compound_list if compound}
    longest = max(map(len, compounds), default=0)

    tokens = []
    total = len(result_list)
    start = 0
    while start < total:
      joined = ''
      match_end = None
      match_text = None
      # Extend the run token by token and remember the longest exact match.
      for end in range(start, total):
        surface = result_list[end][0]
        joined += surface
        if len(joined) > longest:
          # No compound can be this long, so extending further is pointless.
          break
        # Requiring a non-empty surface keeps empty-surface tokens that follow
        # a match from being swallowed into it.
        if surface and joined in compounds:
          match_end = end + 1
          match_text = joined

      if match_end is None:
        surface, features = result_list[start]
        tokens.append([surface, features])
        start += 1
      else:
        tokens.append([match_text, compound_features])
        start = match_end
    return tokens

  @staticmethod
  def _write_text(filepath, text, encoding):
    """Write text to a file, turning a permission error into SaveFailureException."""
    try:
      # errors='ignore' silently drops characters the encoding cannot represent.
      with open(filepath, mode='w', encoding=encoding, errors='ignore') as f:
        f.write(text)
    except PermissionError as err:
      raise SaveFailureException("ファイルに書き込みできません。") from err
